R as an Extendable Environment



Relational Databases



Advantages of Relational Databases in Data Science



Programming Interfaces



R DBI Standard



Use Cases



CTA Bus/Rail Aggregation

library(DBI)
library(RPostgreSQL)
# CONNECT
conn <- dbConnect(RPostgreSQL::PostgreSQL(), host=config$pg_conn$host, dbname=config$pg_conn$name,
                  user=config$pg_conn$user, password=config$pg_conn$pwd, port=config$pg_conn$port)
# R AGGREGATION (MULTIPLE FUNCS)
# do.call(data.frame, 
#        aggregate(rides ~ route + routename, agg_csv,
#                  function(x) c(count=length(x), sum=sum(x), mean=mean(x),
#                                median=median(x), min=min(x), max=max(x))))
# POSTGRES AGGREGATION
sql <- 'SELECT rt.route_name, 
               COUNT(rd.rides) AS "count", 
               SUM(rd.rides) AS "sum", 
               AVG(rd.rides) AS "mean", 
               MEDIAN(rd.rides) AS "median",
               R_MEDIAN(rd.rides) AS "r_median",
               MIN(rd.rides) AS "min", 
               MAX(rd.rides) AS "max"
        FROM bus_routes rt
        INNER JOIN bus_rides rd ON rt.route_id = rd.route_id
        GROUP BY rt.route_name
        ORDER BY SUM(rd.rides) DESC
        LIMIT 10'
agg_sql <- dbGetQuery(conn, sql)
knitr::kable(agg_sql)
route_name count sum mean median r_median min max
79th 6513 176238493 27059.50 27672 27672 7896 43081
Ashland 6513 156178845 23979.56 23676 23676 5570 45177
Chicago 6513 130620291 20055.32 21767 21767 4448 35893
Western 6513 129322319 19856.03 19553 19553 4985 37202
Cottage Grove 6513 128999766 19806.50 21181 21181 5489 31187
Belmont 6513 123065617 18895.38 20732 20732 3451 30025
Clark 6513 119459342 18341.68 19171 19171 3310 26896
King Drive 6513 119070638 18282.00 19402 19402 3993 29896
Halsted 6513 118284951 18161.36 19305 19305 2915 30605
Sheridan 6513 118178375 18145.00 18812 18812 2666 33236

Graphing

ggplot(agg_sql, aes(route_name, sum, fill=route_name)) + geom_col(position = "dodge") +
  labs(title="CTA Top 10 Bus Routes by Ridership", x="Year", y="Rides") +
  scale_y_continuous(expand = c(0, 0), label=comma, limit=c(0,2E8)) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))


Calculation + Aggregation

sql <- 'WITH station_agg AS
         (SELECT DATE_PART(\'year\', r.ride_date)::integer AS "year",
                 r.station_id,
                 r.station_name,
                 COUNT(r.rides)::numeric(20,5) AS "count", 
                 SUM(r.rides)::numeric(20,5) AS "sum", 
                 AVG(r.rides)::numeric(20,5) AS "mean", 
                 MEDIAN(r.rides)::numeric(20,5) AS "median",
                 MIN(r.rides)::numeric(20,5) AS "min", 
                 MAX(r.rides)::numeric(20,5) AS "max"
          FROM rail_rides r
          GROUP BY DATE_PART(\'year\', r.ride_date),
                   r.station_id,
                   r.station_name
          ),
                   
      merge_rail AS
         (SELECT s.*, 
                 r.rail_line,
                 (s."sum" / COUNT(*) OVER(PARTITION BY s.station_id, "year")) AS rail_total
          FROM station_agg s
          INNER JOIN rail_stations r ON s.station_id = r.station_id
         )
         
      SELECT m."year", m.rail_line, SUM(m.rail_total)  AS rail_total
      FROM merge_rail m
      GROUP BY m."year", m.rail_line
      ORDER BY m.rail_line, m."year"'
  
agg_sql <- dbGetQuery(conn, sql)
knitr::kable(agg_sql[sample(1:nrow(agg_sql), 20),])
year rail_line rail_total
141 2015 red 67921829
61 2007 orange 12469954
111 2003 purple_exp 11049890
57 2003 orange 11005673
25 2007 brown 15158712
40 2004 green 10829726
28 2010 brown 18312314
73 2001 pink 5965504
115 2007 purple_exp 11685317
47 2011 green 14277530
106 2016 purple 2114111
158 2014 yellow 1497412
82 2010 pink 8872230
125 2017 purple_exp 14230789
162 2018 yellow 1124127
7 2007 blue 34996201
86 2014 pink 10363334
83 2011 pink 9453552
150 2006 yellow 1103805
72 2018 orange 11886597

Graphing

cta_palette <- c(blue="#00A1DE", brown="#62361B", green="#009B3A", orange="#F9461C", pink="#E27EA6",
                 purple="#522398", purple_exp="#8059BA", red="#C60C30", yellow="#F9E300")
ggplot(agg_sql, aes(year, rail_total, color=rail_line)) + 
  geom_line(stat="identity") + geom_point(stat="identity") +
  labs(title="CTA Rail Ridership By Year", x="Year", y="Rides") +
  scale_x_continuous("year", breaks=unique(agg_sql$year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) +
  scale_color_manual(values = cta_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))


Modeling

CREATE MATERIALIZED VIEW Rail_Model_Data AS
    SELECT r.id, r.station_id, r.station_name, r.ride_date, r.day_type, r.rides AS raw, 
          (r.rides / COUNT(*) OVER(PARTITION BY r.station_id, r.ride_date)) AS rides,
          CASE 
               WHEN r.normalized_date BETWEEN '2099-01-01' AND '2099-03-19' THEN 'winter'
               WHEN r.normalized_date BETWEEN '2099-03-20' AND '2099-06-19' THEN 'spring'
               WHEN r.normalized_date BETWEEN '2099-06-20' AND '2099-09-19' THEN 'summer'
               WHEN r.normalized_date BETWEEN '2099-09-20' AND '2099-12-19' THEN 'fall'
               WHEN r.normalized_date BETWEEN '2099-12-20' AND '2099-12-31' THEN 'winter'
               ELSE NULL
           END As season,        
           REPLACE(REPLACE((regexp_split_to_array(s.location, '\s+'))[1], ',', ''), '(', '')::numeric AS latitude,
           REPLACE((regexp_split_to_array(s.location, '\s+'))[2], ')', '')::numeric AS longitude,
           s.rail_line, s.ada, s.direction,
           ue.ue_rate, g.gas_price, w.avg_temp, w.precipitation, w.snow_depth
    FROM 
       (
        SELECT id, station_id, station_name, day_type, rides, ride_date, 
               ride_date + (2099 - date_part('year', ride_date)  ||' year')::interval as normalized_date
        FROM rail_rides
       )r
    INNER JOIN rail_stations s ON s.station_id = r.station_id
    INNER JOIN unemployment_rates ue ON ue.ue_date = r.ride_date
    INNER JOIN gas_prices g ON g.gas_date = r.ride_date
    INNER JOIN weather_data w ON w.weather_date = r.ride_date
    ORDER BY r.ride_date, r.station_id;
    
REFRESH MATERIALIZED VIEW Rail_Model_Data;
rail_model_data <- dbGetQuery(conn, "SELECT * FROM rail_model_data")
head(rail_model_data)
model <- lm(rides ~ day_type + season + latitude + longitude + rail_line + 
                    ue_rate + gas_price + avg_temp + precipitation + snow_depth, 
            data = rail_model_data)
summary(model)

Call:
lm(formula = rides ~ day_type + season + latitude + longitude + 
    rail_line + ue_rate + gas_price + avg_temp + precipitation + 
    snow_depth, data = rail_model_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-6246.2  -935.6  -256.2   537.6 30563.4 

Coefficients:
                       Estimate  Std. Error  t value             Pr(>|t|)    
(Intercept)         -67333.2410   3539.2418  -19.025 < 0.0000000000000002 ***
day_typeU             -454.9352      6.6213  -68.707 < 0.0000000000000002 ***
day_typeW             1159.1548      5.2660  220.120 < 0.0000000000000002 ***
seasonspring          -180.2668      5.3604  -33.630 < 0.0000000000000002 ***
seasonsummer          -179.9820      6.5945  -27.293 < 0.0000000000000002 ***
seasonwinter          -160.1206      6.1885  -25.874 < 0.0000000000000002 ***
latitude             -3954.6687     36.5219 -108.282 < 0.0000000000000002 ***
longitude            -2679.7920     43.6084  -61.451 < 0.0000000000000002 ***
rail_linebrown       -1187.7668      7.2026 -164.909 < 0.0000000000000002 ***
rail_linegreen       -2076.7759      6.8344 -303.873 < 0.0000000000000002 ***
rail_lineorange      -1082.7685      8.2805 -130.762 < 0.0000000000000002 ***
rail_linepink        -2215.7928      7.3581 -301.135 < 0.0000000000000002 ***
rail_linepurple      -2043.6363     11.1396 -183.457 < 0.0000000000000002 ***
rail_linepurple_exp  -1558.0431      7.6564 -203.495 < 0.0000000000000002 ***
rail_linered          1636.6617      6.9037  237.070 < 0.0000000000000002 ***
rail_lineyellow      -1418.1369     17.5558  -80.779 < 0.0000000000000002 ***
ue_rate                 -7.6397      0.9665   -7.904   0.0000000000000027 ***
gas_price              188.1598      2.4375   77.193 < 0.0000000000000002 ***
avg_temp                 5.1418      0.1658   31.005 < 0.0000000000000002 ***
precipitation          -84.0921      5.2553  -16.001 < 0.0000000000000002 ***
snow_depth               1.8487      1.2051    1.534                0.125    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1856 on 1038214 degrees of freedom
  (210765 observations deleted due to missingness)
Multiple R-squared:  0.4021,    Adjusted R-squared:  0.4021 
F-statistic: 3.491e+04 on 20 and 1038214 DF,  p-value: < 0.00000000000000022
graph_data <- data.frame(param = names(model$coefficients[-1]),
                         value = model$coefficients[-1],
                         row.names = NULL)
ggplot(graph_data) + geom_col(aes(x=param, y=value, fill=param), position = "dodge") +
  labs(title="CTA System Rail Regression Point Estimates", x="Parameters", y="Value") +
  guides(fill=FALSE) + ylim(-4000, 2000) + 
  scale_fill_manual(values = seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=45, vjust=0.95, hjust=0.95))

# DISCONNECT FROM DATABASE
dbDisconnect(conn)
[1] TRUE


Divvy Time and Distance Calculation

options(connectionObserver = NULL)
library(odbc)
conn <- dbConnect(odbc::odbc(),
                  instance = config$db2_conn$instance,
                  driver = config$db2_conn$driver,
                  database = config$db2_conn$database,
                  server = config$db2_conn$server,
                  hostname = config$db2_conn$hostname,
                  port = config$db2_conn$port,
                  uid = config$db2_conn$uid,
                  pwd = config$db2_conn$pwd,
                  LongDataCompat = 1)
day_df <- dbGetQuery(conn, "SELECT t.START_TIME, t.STOP_TIME,
                                   t.TRIP_DURATION
                            FROM TRIPS t
                            WHERE DATE(t.START_TIME) = TO_DATE('2015-09-23', 'YYYY-MM-DD')
                            ORDER BY t.START_TIME")
knitr::kable(head(day_df, 20))
START_TIME STOP_TIME TRIP_DURATION
2015-09-23 00:00:00 2015-09-23 00:07:00 414
2015-09-23 00:00:00 2015-09-23 00:03:00 183
2015-09-23 00:01:00 2015-09-23 00:09:00 450
2015-09-23 00:01:00 2015-09-23 00:24:00 1350
2015-09-23 00:01:00 2015-09-23 00:07:00 402
2015-09-23 00:02:00 2015-09-23 00:15:00 793
2015-09-23 00:03:00 2015-09-23 00:27:00 1435
2015-09-23 00:04:00 2015-09-23 00:06:00 134
2015-09-23 00:05:00 2015-09-23 00:19:00 820
2015-09-23 00:06:00 2015-09-23 00:15:00 526
2015-09-23 00:07:00 2015-09-23 00:15:00 506
2015-09-23 00:08:00 2015-09-23 00:15:00 425
2015-09-23 00:09:00 2015-09-23 00:32:00 1372
2015-09-23 00:10:00 2015-09-23 00:24:00 881
2015-09-23 00:10:00 2015-09-23 00:35:00 1445
2015-09-23 00:10:00 2015-09-23 00:26:00 935
2015-09-23 00:11:00 2015-09-23 00:23:00 717
2015-09-23 00:13:00 2015-09-23 00:19:00 382
2015-09-23 00:14:00 2015-09-23 00:15:00 66
2015-09-23 00:16:00 2015-09-23 00:38:00 1316
ggplot(day_df, aes(START_TIME, TRIP_DURATION)) + geom_line(color=seaborn_palette[1]) +
  xlab("Start Time") + ylab("Trip Duration")

agg_sql <- dbGetQuery(conn, "WITH sub AS 
                                (SELECT t.FROM_STATION_ID, t.FROM_STATION_NAME,
                                        t.FROM_LATITUDE, t.FROM_LONGITUDE,
                                        ROUND(CASE 
                                                   WHEN t.TO_LATITUDE IS NOT NULL AND t.FROM_LATITUDE IS NOT NULL
                                                   THEN
                                                        3963.1 * (2 * ATAN(1) - 
                                                                  ASIN(SIN(RADIANS(t.FROM_LATITUDE)) * SIN(RADIANS(t.TO_LONGITUDE)) + 
                                                                       COS(RADIANS(t.FROM_LATITUDE)) * COS(RADIANS(t.TO_LONGITUDE)) * 
                                                                       COS(RADIANS(t.TO_LONGITUDE - t.FROM_LONGITUDE))
                                                                      ) 
                                                                 )
                                                   ELSE NULL
                                              END, 4) AS TRIP_DISTANCE
                                 FROM TRIPS t
                                 WHERE DATE(t.START_TIME) = TO_DATE('2015-09-23', 'YYYY-MM-DD'))
                            
                            SELECT FROM_STATION_ID, FROM_STATION_NAME, 
                                   FROM_LATITUDE, FROM_LONGITUDE,
                                   MIN(TRIP_DISTANCE) AS MIN_DISTANCE,
                                   MAX(TRIP_DISTANCE) AS MAX_DISTANCE
                            
                            FROM sub 
                            GROUP BY FROM_STATION_ID, FROM_STATION_NAME, 
                                     FROM_LATITUDE, FROM_LONGITUDE
                            ORDER BY MAX(TRIP_DISTANCE) DESC
                            FETCH FIRST 10 ROWS ONLY;")
knitr::kable(agg_sql)
FROM_STATION_ID FROM_STATION_NAME FROM_LATITUDE FROM_LONGITUDE MIN_DISTANCE MAX_DISTANCE
467 Western Ave & Lunt Ave 42.00859 -87.69049 8969.148 8971.314
494 Kedzie Ave & Bryn Mawr Ave 41.98240 -87.70892 8969.882 8971.135
447 Glenwood Ave & Morse Ave 42.00797 -87.66550 8967.513 8971.132
466 Ridge Blvd & Touhy Ave 42.01213 -87.68291 8969.393 8970.895
294 Broadway & Berwyn Ave 41.97835 -87.65975 8963.826 8970.855
479 Drake Ave & Montrose Ave 41.96115 -87.71657 8969.369 8970.804
469 St. Louis Ave & Balmoral Ave 41.98039 -87.71611 8967.694 8970.700
450 Warren Park West 42.00201 -87.68973 8970.580 8970.580
451 Sheridan Rd & Loyola Ave 42.00104 -87.66120 8966.526 8970.579
325 Clark St & Winnemac Ave 41.97339 -87.66836 8964.613 8970.512

Mapping

divvy_from <- dbGetQuery(conn, "SELECT t.FROM_STATION_NAME, 
                                       t.FROM_LATITUDE, 
                                       t.FROM_LONGITUDE,
                                       SUM(t.TRIP_DURATION) AS Total_Duration,
                                       MIN(t.TRIP_DURATION) AS Min_Duration,
                                       AVG(t.TRIP_DURATION) AS Avg_Duration,
                                       MEDIAN(t.TRIP_DURATION) AS Median_Duration,
                                       MAX(t.TRIP_DURATION) AS Max_Duration
                                       
                                FROM TRIPS t
                                GROUP BY t.FROM_STATION_NAME,
                                         t.FROM_LATITUDE, 
                                         t.FROM_LONGITUDE
                                ORDER BY SUM(t.TRIP_DURATION) DESC
                                FETCH FIRST 10 ROWS ONLY;")
knitr::kable(divvy_from)
FROM_STATION_NAME FROM_LATITUDE FROM_LONGITUDE TOTAL_DURATION MIN_DURATION AVG_DURATION MEDIAN_DURATION MAX_DURATION
Lake Shore Dr & Monroe St 41.88096 -87.61674 444455108 60 1803 1347 6403880
Streeter Dr & Grand Ave 41.89228 -87.61204 431638473 60 1956 1417 5444590
Michigan Ave & Oak St 41.90096 -87.62378 351053592 60 1815 1300 2549130
Millennium Park 41.88103 -87.62408 347331136 60 1820 1241 6785180
Theater on the Lake 41.92628 -87.63083 330660276 60 1524 1285 740561
Lake Shore Dr & North Blvd 41.91172 -87.62680 286215561 60 1452 1173 336578
Streeter Dr & Illinois St 41.89107 -87.61220 222454638 60 1615 1320 84073
Canal St & Adams St 41.87926 -87.63990 184661822 60 837 614 9961130
Michigan Ave & Washington St 41.88389 -87.62465 179427697 60 1160 649 1876990
Shedd Aquarium 41.86722 -87.61535 178996820 60 1675 1307 1389890
ggplot(transform(divvy_from, FROM_STATION_NAME=gsub("&", "\n&", FROM_STATION_NAME)),
                 aes(FROM_STATION_NAME, TOTAL_DURATION, fill=FROM_STATION_NAME)) +
  geom_col(position = "dodge") +
  labs(title="Divvy Top 10 Origination Stations", x="Station", y="Trip Duration (seconds)") +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

divvy_to <- dbGetQuery(conn, "SELECT t.TO_STATION_NAME, 
                                     t.TO_LATITUDE, 
                                     t.TO_LONGITUDE,
                                     SUM(t.TRIP_DURATION) AS Total_Duration,
                                     MIN(t.TRIP_DURATION) AS Min_Duration,
                                     AVG(t.TRIP_DURATION) AS Avg_Duration,
                                     MEDIAN(t.TRIP_DURATION) AS Median_Duration,
                                     MAX(t.TRIP_DURATION) AS Max_Duration
                                   
                            FROM TRIPS t
                            WHERE TO_STATION_NAME <> 'DIVVY Map Frame B/C Station'
                            GROUP BY t.TO_STATION_NAME,
                                     t.TO_LATITUDE, 
                                     t.TO_LONGITUDE
                            ORDER BY SUM(t.TRIP_DURATION) DESC
                            FETCH FIRST 10 ROWS ONLY;")
knitr::kable(divvy_to)
TO_STATION_NAME TO_LATITUDE TO_LONGITUDE TOTAL_DURATION MIN_DURATION AVG_DURATION MEDIAN_DURATION MAX_DURATION
Streeter Dr & Grand Ave 41.89228 -87.61204 469992965 60 1914 1432 297905
Lake Shore Dr & Monroe St 41.88096 -87.61674 396754905 60 1702 1318 731920
Theater on the Lake 41.92628 -87.63083 370722559 60 1598 1374 383963
Michigan Ave & Oak St 41.90096 -87.62378 369016876 60 1782 1306 2549130
Millennium Park 41.88103 -87.62408 344723608 60 1637 1157 566401
Lake Shore Dr & North Blvd 41.91172 -87.62680 337381528 60 1524 1295 269506
Streeter Dr & Illinois St 41.89107 -87.61220 264397622 60 1611 1325 85368
Michigan Ave & Washington St 41.88389 -87.62465 158901379 60 1003 572 3054250
Shedd Aquarium 41.86722 -87.61535 158356779 60 1563 1307 184015
Adler Planetarium 41.86610 -87.60727 153361126 60 1635 1357 524315
ggplot(transform(divvy_to, TO_STATION_NAME=gsub("&", "\n&", TO_STATION_NAME)), 
       aes(TO_STATION_NAME, TOTAL_DURATION, fill=TO_STATION_NAME)) +
  geom_col(position = "dodge") +
  labs(title="Divvy Top 10 Destination Stations", x="Station", y="Trip Duration (seconds)") +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

library(jsonlite)
x <- toJSON(divvy_from[c("FROM_STATION_NAME", "FROM_LATITUDE", "FROM_LONGITUDE")], pretty=TRUE)
# EXPORT TO FILE
fileConn <- file("Divvy_From_Coords.json")
writeLines(x, fileConn)
close(fileConn)
x
[
  {
    "FROM_STATION_NAME": "Lake Shore Dr & Monroe St",
    "FROM_LATITUDE": 41.881,
    "FROM_LONGITUDE": -87.6167
  },
  {
    "FROM_STATION_NAME": "Streeter Dr & Grand Ave",
    "FROM_LATITUDE": 41.8923,
    "FROM_LONGITUDE": -87.612
  },
  {
    "FROM_STATION_NAME": "Michigan Ave & Oak St",
    "FROM_LATITUDE": 41.901,
    "FROM_LONGITUDE": -87.6238
  },
  {
    "FROM_STATION_NAME": "Millennium Park",
    "FROM_LATITUDE": 41.881,
    "FROM_LONGITUDE": -87.6241
  },
  {
    "FROM_STATION_NAME": "Theater on the Lake",
    "FROM_LATITUDE": 41.9263,
    "FROM_LONGITUDE": -87.6308
  },
  {
    "FROM_STATION_NAME": "Lake Shore Dr & North Blvd",
    "FROM_LATITUDE": 41.9117,
    "FROM_LONGITUDE": -87.6268
  },
  {
    "FROM_STATION_NAME": "Streeter Dr & Illinois St",
    "FROM_LATITUDE": 41.8911,
    "FROM_LONGITUDE": -87.6122
  },
  {
    "FROM_STATION_NAME": "Canal St & Adams St",
    "FROM_LATITUDE": 41.8793,
    "FROM_LONGITUDE": -87.6399
  },
  {
    "FROM_STATION_NAME": "Michigan Ave & Washington St",
    "FROM_LATITUDE": 41.8839,
    "FROM_LONGITUDE": -87.6246
  },
  {
    "FROM_STATION_NAME": "Shedd Aquarium",
    "FROM_LATITUDE": 41.8672,
    "FROM_LONGITUDE": -87.6154
  }
] 
x <- toJSON(divvy_to[c("TO_STATION_NAME", "TO_LATITUDE", "TO_LONGITUDE")], pretty=TRUE)
# EXPORT TO FILE
fileConn <- file("Divvy_To_Coords.json")
writeLines(x, fileConn)
close(fileConn)
x
[
  {
    "TO_STATION_NAME": "Streeter Dr & Grand Ave",
    "TO_LATITUDE": 41.8923,
    "TO_LONGITUDE": -87.612
  },
  {
    "TO_STATION_NAME": "Lake Shore Dr & Monroe St",
    "TO_LATITUDE": 41.881,
    "TO_LONGITUDE": -87.6167
  },
  {
    "TO_STATION_NAME": "Theater on the Lake",
    "TO_LATITUDE": 41.9263,
    "TO_LONGITUDE": -87.6308
  },
  {
    "TO_STATION_NAME": "Michigan Ave & Oak St",
    "TO_LATITUDE": 41.901,
    "TO_LONGITUDE": -87.6238
  },
  {
    "TO_STATION_NAME": "Millennium Park",
    "TO_LATITUDE": 41.881,
    "TO_LONGITUDE": -87.6241
  },
  {
    "TO_STATION_NAME": "Lake Shore Dr & North Blvd",
    "TO_LATITUDE": 41.9117,
    "TO_LONGITUDE": -87.6268
  },
  {
    "TO_STATION_NAME": "Streeter Dr & Illinois St",
    "TO_LATITUDE": 41.8911,
    "TO_LONGITUDE": -87.6122
  },
  {
    "TO_STATION_NAME": "Michigan Ave & Washington St",
    "TO_LATITUDE": 41.8839,
    "TO_LONGITUDE": -87.6246
  },
  {
    "TO_STATION_NAME": "Shedd Aquarium",
    "TO_LATITUDE": 41.8672,
    "TO_LONGITUDE": -87.6154
  },
  {
    "TO_STATION_NAME": "Adler Planetarium",
    "TO_LATITUDE": 41.8661,
    "TO_LONGITUDE": -87.6073
  }
] 

# DISCONNECT FROM DATABASE
dbDisconnect(conn)


Metra Data Normalization and Testing

library(RSQLite)
conn <- dbConnect(RSQLite::SQLite(), dbname=config$sqlite_conn$database)

By Year

sql <- "SELECT l.Line, 
               t.Line AS Short,
               CAST(strftime('%Y', Report_Month) AS INT) as Year,
               SUM(t.Late_Trains) AS Total_Late_Trains,
               AVG(t.Late_Trains) AS Avg_Late_Trains
        FROM Trains t
        INNER JOIN Lines l ON t.LineID = l.ID
        WHERE l.Line <> 'SYSTEM'
        GROUP BY l.Line,
                 t.Line,
                 strftime('%Y', t.Report_Month)"
agg_sql <- dbGetQuery(conn, sql)
knitr::kable(agg_sql[sample(1:nrow(agg_sql), 20),])
Line Short Year Total_Late_Trains Avg_Late_Trains
76 North Central Service NCS 2014 1698 35.375000
23 Metra Electric Blue Island Elec-SC 2011 1245 17.291667
47 Metra Electric South Chicago Elec-BI 2015 552 10.036364
20 Heritage Corridor Heritage 2018 567 11.812500
22 Metra Electric Blue Island Elec-SC 2010 692 9.611111
116 Union Pacific / Northwest UP-NW 2014 2562 35.583333
65 Milwaukee District / West Milw-W 2013 2795 38.819444
102 Union Pacific / North UP-N 2010 2833 39.347222
78 North Central Service NCS 2016 921 19.187500
95 SouthWest Service SWS 2013 1035 17.250000
39 Metra Electric Main Line Elec-ML 2017 1377 19.125000
121 Union Pacific / West UP-W 2009 2021 28.069444
48 Metra Electric South Chicago Elec-BI 2016 419 6.983333
118 Union Pacific / Northwest UP-NW 2016 1910 26.527778
84 Rock Island District RI 2012 2594 36.027778
56 Milwaukee District / North Milw-N 2014 4192 58.222222
73 North Central Service NCS 2011 1500 31.250000
19 Heritage Corridor Heritage 2017 363 7.562500
36 Metra Electric Main Line Elec-ML 2014 2032 28.222222
26 Metra Electric Blue Island Elec-SC 2014 973 13.513889
ggplot(agg_sql, aes(Year, Total_Late_Trains, color=Line)) + 
  geom_line(stat="identity") +
  labs(title="Metra Late Trains by Line", x="Year", y="Late Trains") +
  scale_x_continuous("year", breaks=unique(agg_sql$Year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_color_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))


Statistical Testing

T-Tests

lines_list <- split(agg_sql, agg_sql$Short)
nms <- lapply(combn(unique(agg_sql$Short), 2, simplify=FALSE), function(i) paste(i, collapse="_"))
res <- t(sapply(combn(unique(agg_sql$Short), 2, simplify=FALSE), function(i) {
  t <- t.test(lines_list[[i[1]]]$Total_Late_Trains, lines_list[[i[2]]]$Total_Late_Trains)
  c(statistic = t$statistic, p_value = t$p.value)
}))
row.names(res) <- nms
knitr::kable(res)
statistic.t p_value
BNSF_Heritage 9.8494756 0.0000034
BNSF_Elec-SC 8.7158417 0.0000076
BNSF_Elec-ML 6.1374028 0.0000957
BNSF_Elec-BI 9.3920454 0.0000054
BNSF_Milw-N 2.8910097 0.0111757
BNSF_Milw-W 4.9595192 0.0002727
BNSF_NCS 7.9789334 0.0000143
BNSF_RI 4.5415218 0.0006351
BNSF_SWS 7.5383993 0.0000207
BNSF_UP-N 4.9702928 0.0001637
BNSF_UP-NW 5.4107001 0.0002478
BNSF_UP-W 3.7401929 0.0021030
Heritage_Elec-SC -5.2621722 0.0000918
Heritage_Elec-ML -12.0629835 0.0000001
Heritage_Elec-BI -3.4987332 0.0027396
Heritage_Milw-N -10.3277589 0.0000017
Heritage_Milw-W -9.1513465 0.0000037
Heritage_NCS -7.7996898 0.0000020
Heritage_RI -11.0240760 0.0000006
Heritage_SWS -8.6861313 0.0000010
Heritage_UP-N -6.3532760 0.0001028
Heritage_UP-NW -13.6677426 0.0000000
Heritage_UP-W -9.5307864 0.0000033
Elec-SC_Elec-ML -7.6930971 0.0000013
Elec-SC_Elec-BI 3.0553272 0.0090725
Elec-SC_Milw-N -8.4524134 0.0000058
Elec-SC_Milw-W -6.7021299 0.0000291
Elec-SC_NCS -2.6856024 0.0153183
Elec-SC_RI -8.3005633 0.0000030
Elec-SC_SWS -3.9255453 0.0011167
Elec-SC_UP-N -4.5812945 0.0009268
Elec-SC_UP-NW -9.4434609 0.0000001
Elec-SC_UP-W -7.5259529 0.0000146
Elec-ML_Elec-BI 10.7051161 0.0000005
Elec-ML_Milw-N -4.3660738 0.0008571
Elec-ML_Milw-W -1.6151844 0.1279065
Elec-ML_NCS 5.2419772 0.0000733
Elec-ML_RI -2.7359672 0.0152246
Elec-ML_SWS 3.8518590 0.0012327
Elec-ML_UP-N -0.7261232 0.4813509
Elec-ML_UP-NW -1.8073859 0.0875269
Elec-ML_UP-W -3.2060881 0.0069770
Elec-BI_Milw-N -9.6182342 0.0000038
Elec-BI_Milw-W -8.2352811 0.0000119
Elec-BI_NCS -5.9428602 0.0000643
Elec-BI_RI -10.0649428 0.0000019
Elec-BI_SWS -7.0205384 0.0000176
Elec-BI_UP-N -5.6282342 0.0002825
Elec-BI_UP-NW -12.4136552 0.0000001
Elec-BI_UP-W -8.7676538 0.0000080
Milw-N_Milw-W 2.6700024 0.0162767
Milw-N_NCS 7.2630725 0.0000175
Milw-N_RI 2.0467572 0.0573187
Milw-N_SWS 6.5511394 0.0000355
Milw-N_UP-N 2.7933483 0.0120074
Milw-N_UP-NW 3.2584554 0.0063262
Milw-N_UP-W 1.0801680 0.2944043
Milw-W_NCS 5.1854922 0.0002195
Milw-W_RI -0.8225248 0.4216052
Milw-W_SWS 4.2973444 0.0008763
Milw-W_UP-N 0.4888439 0.6313060
Milw-W_UP-NW 0.2804970 0.7829139
Milw-W_UP-W -1.5653195 0.1356000
NCS_RI -6.6347574 0.0000189
NCS_SWS -1.3501689 0.1939515
NCS_UP-N -3.4515401 0.0055512
NCS_UP-NW -7.0452631 0.0000029
NCS_UP-W -6.2606259 0.0000586
RI_SWS 5.6476491 0.0000687
RI_UP-N 1.1936645 0.2499458
RI_UP-NW 1.3014342 0.2118233
RI_UP-W -0.8818209 0.3903462
SWS_UP-N -2.7957833 0.0169999
SWS_UP-NW -5.6470712 0.0000295
SWS_UP-W -5.5094534 0.0001466
UP-N_UP-NW -0.3290665 0.7474460
UP-N_UP-W -1.8187654 0.0857421
UP-NW_UP-W -2.0429687 0.0613004

By Month

sql <- "SELECT l.ID,
               l.Line, 
               t.Line AS Short,
               CAST(strftime('%m', Report_Month) AS INT) as Month_Num,
               SUM(t.Late_Trains) AS Total_Late_Trains,
               AVG(t.Late_Trains) AS Avg_Late_Trains
        FROM Trains t
        INNER JOIN Lines l ON t.LineID = l.ID
        WHERE l.Line <> 'SYSTEM'
        GROUP BY l.ID,
                 l.Line,
                 t.Line,
                 CAST(strftime('%m', Report_Month) AS INT)
        ORDER BY l.ID,
                 CAST(strftime('%m', Report_Month) AS INT)"
agg_sql <- transform(dbGetQuery(conn, sql), Month = factor(month.abb[Month_Num], levels=month.abb))
knitr::kable(agg_sql[sample(1:nrow(agg_sql), 20),])
ID Line Short Month_Num Total_Late_Trains Avg_Late_Trains Month
6 1 BNSF Railway BNSF 6 5987 99.78333 Jun
83 7 Milwaukee District / West Milw-W 11 1888 31.46667 Nov
148 13 Union Pacific / West UP-W 4 1914 31.90000 Apr
39 4 Metra Electric South Chicago Elec-BI 3 329 6.58000 Mar
150 13 Union Pacific / West UP-W 6 3043 50.71667 Jun
107 9 Rock Island District RI 11 1940 32.33333 Nov
44 4 Metra Electric South Chicago Elec-BI 8 379 7.58000 Aug
126 11 Union Pacific / North UP-N 6 2321 38.68333 Jun
27 3 Metra Electric Main Line Elec-ML 3 1202 20.03333 Mar
86 8 North Central Service NCS 2 1368 34.20000 Feb
55 5 Metra Electric Blue Island Elec-SC 7 1204 20.06667 Jul
129 11 Union Pacific / North UP-N 9 1724 28.73333 Sep
77 7 Milwaukee District / West Milw-W 5 1454 24.23333 May
41 4 Metra Electric South Chicago Elec-BI 5 312 6.24000 May
8 1 BNSF Railway BNSF 8 4341 72.35000 Aug
121 11 Union Pacific / North UP-N 1 1893 31.55000 Jan
113 10 SouthWest Service SWS 5 1155 23.10000 May
115 10 SouthWest Service SWS 7 1186 23.25490 Jul
89 8 North Central Service NCS 5 852 21.30000 May
152 13 Union Pacific / West UP-W 8 2792 46.53333 Aug
res <- by(agg_sql, agg_sql$Line, function(sub)
  print(ggplot(sub, aes(Month, Total_Late_Trains, fill=Line)) + 
    geom_col(position = "dodge") +
    labs(title=paste(sub$Line, "-\nLate Trains by Month"), x="Year", y="Late Trains") +
    scale_x_discrete("Month", breaks=unique(agg_sql$Month)) +
    scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
    scale_fill_manual(values=seaborn_palette[sub$ID[1]]) +
    theme(legend.position="bottom",
          plot.title = element_text(hjust=0.5, size=18),
          axis.text.x = element_text(angle=0, hjust=0.5)))
)

Correlations

sql <- "SELECT CAST(strftime('%Y', c.Report_Month) AS INT) AS Year,
               SUM(CASE WHEN c.Late_Cause = 'Accident' THEN c.Late_Trains ELSE NULL END) AS [Accident],
               SUM(CASE WHEN c.Late_Cause = 'Catenary Failure' THEN c.Late_Trains ELSE NULL END) AS [Catenary],               
               SUM(CASE WHEN c.Late_Cause = 'Freight Interference - Peak' THEN c.Late_Trains ELSE NULL END) AS [Freight Interference Peak],
               SUM(CASE WHEN c.Late_Cause = 'Freight Interference - Off-Peak' THEN c.Late_Trains ELSE NULL END) AS [Freight Interference Off Peak],  
               SUM(CASE WHEN c.Late_Cause = 'Human Error' THEN c.Late_Trains ELSE NULL END) AS [Human Error],                     
               SUM(CASE WHEN c.Late_Cause = 'Lift Deployment' THEN c.Late_Trains ELSE NULL END) AS [Lift Deployment],                
               SUM(CASE WHEN c.Late_Cause = 'Locomotive Failure' THEN c.Late_Trains ELSE NULL END) AS [Locomotive Failure],              
               SUM(CASE WHEN c.Late_Cause = 'Non-Locomotive Equipment Failure' THEN c.Late_Trains ELSE NULL END) AS [Non-Locomotive Equipment Failure], 
               SUM(CASE WHEN c.Late_Cause = 'Obstruction/Debris' THEN c.Late_Trains ELSE NULL END) AS [Obstruction_Debris],              
               SUM(CASE WHEN c.Late_Cause = 'Other' THEN c.Late_Trains ELSE NULL END) AS [Other],                            
               SUM(CASE WHEN c.Late_Cause = 'Passenger Loading' THEN c.Late_Trains ELSE NULL END) AS [Passenger Loading],
               SUM(CASE WHEN c.Late_Cause = 'Passenger Train Interference' THEN c.Late_Trains ELSE NULL END) AS [Passenger Train Interference],     
               SUM(CASE WHEN c.Late_Cause = 'Sick, Injured, Unruly Passenger' THEN c.Late_Trains ELSE NULL END) AS [Sick Injured Unruly Passenger],
               SUM(CASE WHEN c.Late_Cause = 'Signal/Switch Failure' THEN c.Late_Trains ELSE NULL END) AS [Signal Switch Failure],           
               SUM(CASE WHEN c.Late_Cause = 'Track Work' THEN c.Late_Trains ELSE NULL END) AS [Track Work],                      
               SUM(CASE WHEN c.Late_Cause = 'Weather' THEN c.Late_Trains ELSE NULL END) AS [Weather]            FROM Causes c 
      GROUP BY CAST(strftime('%Y', c.Report_Month) AS INT);"
wide_sql <- dbGetQuery(conn, sql)
knitr::kable(wide_sql)
Year Accident Catenary Freight Interference Peak Freight Interference Off Peak Human Error Lift Deployment Locomotive Failure Non-Locomotive Equipment Failure Obstruction_Debris Other Passenger Loading Passenger Train Interference Sick Injured Unruly Passenger Signal Switch Failure Track Work Weather
2009 522 116 688 1040 1058 510 1202 402 798 538 2736 608 788 2798 1616 2150
2010 1600 196 1754 3054 2212 1082 2476 1144 1468 1158 4130 1484 1670 5290 2746 2946
2011 1338 80 990 2272 1740 902 1320 486 802 792 4290 988 1000 3296 2758 3094
2012 932 162 496 1474 1294 500 1086 326 848 638 2364 440 874 2506 1806 1262
2013 1150 338 642 1706 1508 410 1202 468 782 526 2276 404 768 3274 1328 2194
2014 1370 144 1336 2334 1470 428 1658 828 1100 568 1509 490 732 2536 1964 3322
2015 898 302 726 1360 1318 330 NA NA 870 488 1084 260 574 1770 1188 1950
2016 1088 200 618 1066 1270 290 NA NA 892 484 1082 308 814 2782 2000 1108
2017 1268 66 718 1294 1796 496 NA NA 1090 640 1162 266 712 2446 1896 1194
2018 814 154 1112 1916 1990 640 NA NA 1230 562 1324 444 776 3478 1570 2108
knitr::kable(cor(wide_sql[-1], use = "complete.obs", method="pearson"))
Accident Catenary Freight Interference Peak Freight Interference Off Peak Human Error Lift Deployment Locomotive Failure Non-Locomotive Equipment Failure Obstruction_Debris Other Passenger Loading Passenger Train Interference Sick Injured Unruly Passenger Signal Switch Failure Track Work Weather
Accident 1.0000000 0.1435415 0.7806773 0.9608999 0.8976619 0.5824678 0.7215697 0.7642611 0.6828030 0.6668736 0.3463388 0.5872245 0.5954141 0.6114195 0.6652174 0.6677474
Catenary 0.1435415 1.0000000 -0.1255482 0.0037574 0.1328661 -0.2913271 0.0296204 0.0639429 0.0278461 -0.1184067 -0.2985693 -0.2269049 -0.0105223 0.2158549 -0.5200273 -0.2466628
Freight Interference Peak 0.7806773 -0.1255482 1.0000000 0.9091988 0.8020245 0.6661686 0.9547198 0.9741564 0.9170038 0.7587071 0.3725510 0.7711136 0.7310978 0.7209778 0.7028399 0.7798693
Freight Interference Off Peak 0.9608999 0.0037574 0.9091988 1.0000000 0.9399653 0.7224539 0.8670727 0.8811132 0.8227255 0.8051196 0.4622986 0.7564155 0.7459639 0.7342568 0.7756815 0.7157045
Human Error 0.8976619 0.1328661 0.8020245 0.9399653 1.0000000 0.8356280 0.8342163 0.7914287 0.7573302 0.8979099 0.6539403 0.8494920 0.8652815 0.8833295 0.7699593 0.5505740
Lift Deployment 0.5824678 -0.2913271 0.6661686 0.7224539 0.8356280 1.0000000 0.7013289 0.5799427 0.5998879 0.9445498 0.9267498 0.9752919 0.9003905 0.8317744 0.9168026 0.4295133
Locomotive Failure 0.7215697 0.0296204 0.9547198 0.8670727 0.8342163 0.7013289 1.0000000 0.9781492 0.9757232 0.8446837 0.4127356 0.8198952 0.8583588 0.8499444 0.6355407 0.5798980
Non-Locomotive Equipment Failure 0.7642611 0.0639429 0.9741564 0.8811132 0.7914287 0.5799427 0.9781492 1.0000000 0.9622677 0.7347822 0.2636146 0.7106868 0.7395528 0.7528201 0.5662177 0.6705569
Obstruction_Debris 0.6828030 0.0278461 0.9170038 0.8227255 0.7573302 0.5998879 0.9757232 0.9622677 1.0000000 0.7937764 0.2714363 0.7237340 0.8117931 0.7594222 0.5631391 0.4725877
Other 0.6668736 -0.1184067 0.7587071 0.8051196 0.8979099 0.9445498 0.8446837 0.7347822 0.7937764 1.0000000 0.7778087 0.9605269 0.9844827 0.9054438 0.8402201 0.3593215
Passenger Loading 0.3463388 -0.2985693 0.3725510 0.4622986 0.6539403 0.9267498 0.4127356 0.2636146 0.2714363 0.7778087 1.0000000 0.8556134 0.7359260 0.7011243 0.7977969 0.2684835
Passenger Train Interference 0.5872245 -0.2269049 0.7711136 0.7564155 0.8494920 0.9752919 0.8198952 0.7106868 0.7237340 0.9605269 0.8556134 1.0000000 0.9415782 0.8986873 0.8649865 0.4857091
Sick Injured Unruly Passenger 0.5954141 -0.0105223 0.7310978 0.7459639 0.8652815 0.9003905 0.8583588 0.7395528 0.8117931 0.9844827 0.7359260 0.9415782 1.0000000 0.9448665 0.7375805 0.2765033
Signal Switch Failure 0.6114195 0.2158549 0.7209778 0.7342568 0.8833295 0.8317744 0.8499444 0.7528201 0.7594222 0.9054438 0.7011243 0.8986873 0.9448665 1.0000000 0.6072113 0.3542022
Track Work 0.6652174 -0.5200273 0.7028399 0.7756815 0.7699593 0.9168026 0.6355407 0.5662177 0.5631391 0.8402201 0.7977969 0.8649865 0.7375805 0.6072113 1.0000000 0.5830179
Weather 0.6677474 -0.2466628 0.7798693 0.7157045 0.5505740 0.4295133 0.5798980 0.6705569 0.4725877 0.3593215 0.2684835 0.4857091 0.2765033 0.3542022 0.5830179 1.0000000
sql <- "SELECT CAST(strftime('%Y', c.Report_Month) AS INT) AS Year,
               c.Late_Cause,
               SUM(c.Late_Trains) AS Total_Late_Trains
        FROM Causes c
        INNER JOIN Lines l ON c.LineID = l.ID
        WHERE c.Late_Cause IN 
              (SELECT DISTINCT sub_c.Late_Cause 
               FROM Causes AS sub_c 
               WHERE strftime('%Y', sub_c.Report_Month) = '2012')
          AND c.Late_Cause <> 'TOTAL TRAINS DELAYED'
        GROUP BY strftime('%Y', c.Report_Month),
                 c.Late_Cause"
agg_sql <- dbGetQuery(conn, sql)
knitr::kable(agg_sql[sample(1:nrow(agg_sql), 20),])
Year Late_Cause Total_Late_Trains
59 2012 Locomotive Failure 1086
83 2013 Signal/Switch Failure 3274
9 2009 Non-Locomotive Equipment Failure 402
100 2014 Signal/Switch Failure 2536
94 2014 Non-Locomotive Equipment Failure 828
144 2017 Sick, Injured, Unruly Passenger 712
2 2009 Catenary Failure 116
146 2017 Track Work 1896
4 2009 Freight Interference - Peak 688
68 2012 Weather 1262
6 2009 Human Error 1058
104 2015 Catenary Failure 302
5 2009 Freight Interference - Total 1728
162 2018 Weather 2108
90 2014 Freight Interference - Total 3670
141 2017 Other 640
43 2011 Non-Locomotive Equipment Failure 486
157 2018 Passenger Loading 1324
29 2010 Passenger Loading 4130
105 2015 Freight Interference - Off-Peak 1360
ggplot(subset(agg_sql, Late_Cause != 'Freight Interference - Total'),
              aes(Year, Total_Late_Trains, fill=Late_Cause)) + 
  geom_bar(position = "fill", stat = "identity") +
  labs(title="Metra Late Trains by Cause", x="Year", y="Late Trains") +
  guides(fill=guide_legend(ncol=4)) +
  scale_x_continuous("year", breaks=unique(agg_sql$Year)) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

# DISCONNECT FROM DATABASE
dbDisconnect(conn)


Conclusion




---
title: "Leveraging Relational Databases in R"
author: "Parfait Gasana"
date: "April 27, 2019"
output: html_notebook
subtitle: SatRday Chicago 2019
---

<style type="text/css">
div.blue pre { background-color: #EBF4FA; }
.main-container {
  max-width: 1000px;
  margin-left: auto;
  margin-right: auto;
}
</style>

```{r setup, include=FALSE}
library(yaml)
library(ggplot2)
library(scales)

knitr::opts_chunk$set(echo = TRUE)
options(scipen=999)

config = yaml.load_file("dbconfig.yaml")

seaborn_palette <- c('#4878d0', '#ee854a', '#6acc64', '#d65f5f', '#956cb4', '#8c613c', 
                     '#dc7ec0', '#797979', '#d5bb67', '#82c6e2', '#4878d0', '#ee854a', 
                     '#6acc64', '#d65f5f', '#956cb4', '#8c613c', '#dc7ec0', '#797979', 
                     '#d5bb67', '#82c6e2', '#4878d0', '#ee854a', '#6acc64', '#d65f5f')

```

<br/>
<br/>

------

## R as an Extendable Environment

- ### I/O Interface: csv/txt, json, XML, HTML
- ### API Connections: httr, URL request, piping
- ### Remote Integrations: server, cloud computing

<center><img src="assets/R_Extendable_Environment.png" width="550px" /></center> 

------

<br/>

## Relational Databases
<center><img src="assets/RDBMS_Icons.png" width="550px" /></center>  

- ### Commercial or open source software designed to maintain and manage structured data
- ### Based on relational model of IBM's Edgar F. Codd
    - #### Data is stored in logically grouped, disaggregated tables within an interconnected system of relational keys
- ### Mainstay in software and web applications but not in data science

------

<br/>

## Advantages of Relational Databases in Data Science

- ### Data persistence: historical and current needs
- ### Data hygiene: adherence to types with less munging/cleaning
- ### Storage efficiency: normalization reduces repetition of data
- ### Centralization: multiple user environment and security
- ### Scalability: not limited to local resources

------

<br/>

## Programming Interfaces
<center><img src="assets/Languages_Icons.png" width="500px"/></center> 
  
- ### Most programming languages support RDBMS connections
- ### Some languages maintain consistent DB-API specifications and standards:
    - #### Java: JDBC; C++: SQLAPI++; NET: ODBC/OLEDB; PHP: PDO; Python: DBAPI; Perl, Ruby, R: DBI

------

<br/>

## R DBI Standard

<div style="float:right"><img src="assets/R_Database.jpg" /></div>

- ### General: RJDBC, odbc
    - #### Requires corresponding JDBC/ODBC drivers
- ### Specific: 
    - #### DBI standard - ROracle, RPostgreSQL, RMySQL, RSQLite

------

<br/>

## Use Cases 

<center><img src="assets/Database_Use_Cases.png" width="500px"/></center> 

- ### CTA Bus and Rail Ridership with PostgreSQL
- ### Divvy Trips with IBM DB2
- ### Metra On-Time Performance with SQLite

------

<br/>

## <span style="color:#336791">CTA Bus/Rail Aggregation</span>

<center><img src="assets/cta_logo.png" width="150px" /><img src="assets/postgresql_r.png" width="150px" /></center>

- ### <span style="color:#336791">Clear, compact declarative language with portability</span> ####
- ### <span style="color:#336791">Processing with virtual tables occurs behind the scene</span> ###
- ### <span style="color:#336791">Set-based framework facilitates blockwise, vectorized process</span> ###

```{r}
library(DBI)
library(RPostgreSQL)

# CONNECT
conn <- dbConnect(RPostgreSQL::PostgreSQL(), host=config$pg_conn$host, dbname=config$pg_conn$name,
                  user=config$pg_conn$user, password=config$pg_conn$pwd, port=config$pg_conn$port)
```

```{r}
# R AGGREGATION (MULTIPLE FUNCS)
# do.call(data.frame, 
#        aggregate(rides ~ route + routename, agg_csv,
#                  function(x) c(count=length(x), sum=sum(x), mean=mean(x),
#                                median=median(x), min=min(x), max=max(x))))

# POSTGRES AGGREGATION
sql <- 'SELECT rt.route_name, 
               COUNT(rd.rides) AS "count", 
               SUM(rd.rides) AS "sum", 
               AVG(rd.rides) AS "mean", 
               MEDIAN(rd.rides) AS "median",
               R_MEDIAN(rd.rides) AS "r_median",
               MIN(rd.rides) AS "min", 
               MAX(rd.rides) AS "max"
        FROM bus_routes rt
        INNER JOIN bus_rides rd ON rt.route_id = rd.route_id
        GROUP BY rt.route_name
        ORDER BY SUM(rd.rides) DESC
        LIMIT 10'

agg_sql <- dbGetQuery(conn, sql)

knitr::kable(agg_sql)
```

------


### Graphing
```{r fig1, fig.height = 5, fig.width = 10, fig.align = "center"}
ggplot(agg_sql, aes(route_name, sum, fill=route_name)) + geom_col(position = "dodge") +
  labs(title="CTA Top 10 Bus Routes by Ridership", x="Year", y="Rides") +
  scale_y_continuous(expand = c(0, 0), label=comma, limit=c(0,2E8)) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))
```

------


## <span style="color:#336791">Calculation + Aggregation</span>

- ### <span style="color:#336791">CTEs clearly show underlying tables and views without helper objects</span> ###
- ### <span style="color:#336791">Window functions allow useful inline calculations</span> ###
- ### <span style="color:#336791">Complex processing still readable and maintainable</span> ###

```{r}
sql <- 'WITH station_agg AS
         (SELECT DATE_PART(\'year\', r.ride_date)::integer AS "year",
                 r.station_id,
                 r.station_name,
                 COUNT(r.rides)::numeric(20,5) AS "count", 
                 SUM(r.rides)::numeric(20,5) AS "sum", 
                 AVG(r.rides)::numeric(20,5) AS "mean", 
                 MEDIAN(r.rides)::numeric(20,5) AS "median",
                 MIN(r.rides)::numeric(20,5) AS "min", 
                 MAX(r.rides)::numeric(20,5) AS "max"
          FROM rail_rides r
          GROUP BY DATE_PART(\'year\', r.ride_date),
                   r.station_id,
                   r.station_name
          ),
                   
      merge_rail AS
         (SELECT s.*, 
                 r.rail_line,
                 (s."sum" / COUNT(*) OVER(PARTITION BY s.station_id, "year")) AS rail_total
          FROM station_agg s
          INNER JOIN rail_stations r ON s.station_id = r.station_id
         )
         
      SELECT m."year", m.rail_line, SUM(m.rail_total)  AS rail_total
      FROM merge_rail m
      GROUP BY m."year", m.rail_line
      ORDER BY m.rail_line, m."year"'
  
agg_sql <- dbGetQuery(conn, sql)

knitr::kable(agg_sql[sample(1:nrow(agg_sql), 20),])
```

------

### Graphing

```{r fig2, fig.height = 6, fig.width = 10, fig.align = "center"}

cta_palette <- c(blue="#00A1DE", brown="#62361B", green="#009B3A", orange="#F9461C", pink="#E27EA6",
                 purple="#522398", purple_exp="#8059BA", red="#C60C30", yellow="#F9E300")

ggplot(agg_sql, aes(year, rail_total, color=rail_line)) + 
  geom_line(stat="identity") + geom_point(stat="identity") +
  labs(title="CTA Rail Ridership By Year", x="Year", y="Rides") +
  scale_x_continuous("year", breaks=unique(agg_sql$year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) +
  scale_color_manual(values = cta_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))
```

------

## <span style="color:#336791">Modeling</span>

- ### <span style="color:#336791">Advanced preparation of data</span>
- ### <span style="color:#336791">Materialized view facilitates reproducible research</span>
- ### <span style="color:#336791">Compact and straightforward data sourcing</span>


```{sql, eval=FALSE}
CREATE MATERIALIZED VIEW Rail_Model_Data AS
    SELECT r.id, r.station_id, r.station_name, r.ride_date, r.day_type, r.rides AS raw, 
          (r.rides / COUNT(*) OVER(PARTITION BY r.station_id, r.ride_date)) AS rides,
          CASE 
               WHEN r.normalized_date BETWEEN '2099-01-01' AND '2099-03-19' THEN 'winter'
               WHEN r.normalized_date BETWEEN '2099-03-20' AND '2099-06-19' THEN 'spring'
               WHEN r.normalized_date BETWEEN '2099-06-20' AND '2099-09-19' THEN 'summer'
               WHEN r.normalized_date BETWEEN '2099-09-20' AND '2099-12-19' THEN 'fall'
               WHEN r.normalized_date BETWEEN '2099-12-20' AND '2099-12-31' THEN 'winter'
               ELSE NULL
           END As season,        
           REPLACE(REPLACE((regexp_split_to_array(s.location, '\s+'))[1], ',', ''), '(', '')::numeric AS latitude,
           REPLACE((regexp_split_to_array(s.location, '\s+'))[2], ')', '')::numeric AS longitude,
           s.rail_line, s.ada, s.direction,
           ue.ue_rate, g.gas_price, w.avg_temp, w.precipitation, w.snow_depth
    FROM 
       (
        SELECT id, station_id, station_name, day_type, rides, ride_date, 
               ride_date + (2099 - date_part('year', ride_date)  ||' year')::interval as normalized_date
        FROM rail_rides
       )r
    INNER JOIN rail_stations s ON s.station_id = r.station_id
    INNER JOIN unemployment_rates ue ON ue.ue_date = r.ride_date
    INNER JOIN gas_prices g ON g.gas_date = r.ride_date
    INNER JOIN weather_data w ON w.weather_date = r.ride_date
    ORDER BY r.ride_date, r.station_id;
    
REFRESH MATERIALIZED VIEW Rail_Model_Data;
```

```{r}
rail_model_data <- dbGetQuery(conn, "SELECT * FROM rail_model_data")

head(rail_model_data)

model <- lm(rides ~ day_type + season + latitude + longitude + rail_line + 
                    ue_rate + gas_price + avg_temp + precipitation + snow_depth, 
            data = rail_model_data)

summary(model)
```

```{r fig3, fig.height = 6, fig.width = 10, fig.align = "center"}
graph_data <- data.frame(param = names(model$coefficients[-1]),
                         value = model$coefficients[-1],
                         row.names = NULL)

ggplot(graph_data) + geom_col(aes(x=param, y=value, fill=param), position = "dodge") +
  labs(title="CTA System Rail Regression Point Estimates", x="Parameters", y="Value") +
  guides(fill=FALSE) + ylim(-4000, 2000) + 
  scale_fill_manual(values = seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=45, vjust=0.95, hjust=0.95))
```


```{r}
# DISCONNECT FROM DATABASE
dbDisconnect(conn)
```

------

<br/>

## Divvy Time and Distance Calculation

<center><img src="assets/DivvyLogo.png" width="150px"/><img src="assets/r_db2.png" width="150px"/></center>

- ### <span style="color:#336791">Seamless data type integration (i.e., timestamps, coordinate decimals)</span>
- ### <span style="color:#336791">Query specific data points without large in-memory footprint</span>
- ### <span style="color:#336791">Better handle calculations and indicator conversions</span>

  
```{r}
options(connectionObserver = NULL)

library(odbc)

conn <- dbConnect(odbc::odbc(),
                  instance = config$db2_conn$instance,
                  driver = config$db2_conn$driver,
                  database = config$db2_conn$database,
                  server = config$db2_conn$server,
                  hostname = config$db2_conn$hostname,
                  port = config$db2_conn$port,
                  uid = config$db2_conn$uid,
                  pwd = config$db2_conn$pwd,
                  LongDataCompat = 1)
```

```{r}
day_df <- dbGetQuery(conn, "SELECT t.START_TIME, t.STOP_TIME,
                                   t.TRIP_DURATION
                            FROM TRIPS t
                            WHERE DATE(t.START_TIME) = TO_DATE('2015-09-23', 'YYYY-MM-DD')
                            ORDER BY t.START_TIME")

knitr::kable(head(day_df, 20))
```

```{r divvy1a, fig.height = 5, fig.width = 10, fig.align = "center"}
ggplot(day_df, aes(START_TIME, TRIP_DURATION)) + geom_line(color=seaborn_palette[1]) +
  xlab("Start Time") + ylab("Trip Duration")
```


```{r}
agg_sql <- dbGetQuery(conn, "WITH sub AS 
                                (SELECT t.FROM_STATION_ID, t.FROM_STATION_NAME,
                                        t.FROM_LATITUDE, t.FROM_LONGITUDE,
                                        ROUND(CASE 
                                                   WHEN t.TO_LATITUDE IS NOT NULL AND t.FROM_LATITUDE IS NOT NULL
                                                   THEN
                                                        3963.1 * (2 * ATAN(1) - 
                                                                  ASIN(SIN(RADIANS(t.FROM_LATITUDE)) * SIN(RADIANS(t.TO_LONGITUDE)) + 
                                                                       COS(RADIANS(t.FROM_LATITUDE)) * COS(RADIANS(t.TO_LONGITUDE)) * 
                                                                       COS(RADIANS(t.TO_LONGITUDE - t.FROM_LONGITUDE))
                                                                      ) 
                                                                 )
                                                   ELSE NULL
                                              END, 4) AS TRIP_DISTANCE
                                 FROM TRIPS t
                                 WHERE DATE(t.START_TIME) = TO_DATE('2015-09-23', 'YYYY-MM-DD'))
                            
                            SELECT FROM_STATION_ID, FROM_STATION_NAME, 
                                   FROM_LATITUDE, FROM_LONGITUDE,
                                   MIN(TRIP_DISTANCE) AS MIN_DISTANCE,
                                   MAX(TRIP_DISTANCE) AS MAX_DISTANCE
                            
                            FROM sub 
                            GROUP BY FROM_STATION_ID, FROM_STATION_NAME, 
                                     FROM_LATITUDE, FROM_LONGITUDE
                            ORDER BY MAX(TRIP_DISTANCE) DESC
                            FETCH FIRST 10 ROWS ONLY;")

knitr::kable(agg_sql)
```

### Mapping

```{r}
divvy_from <- dbGetQuery(conn, "SELECT t.FROM_STATION_NAME, 
                                       t.FROM_LATITUDE, 
                                       t.FROM_LONGITUDE,
                                       SUM(t.TRIP_DURATION) AS Total_Duration,
                                       MIN(t.TRIP_DURATION) AS Min_Duration,
                                       AVG(t.TRIP_DURATION) AS Avg_Duration,
                                       MEDIAN(t.TRIP_DURATION) AS Median_Duration,
                                       MAX(t.TRIP_DURATION) AS Max_Duration
                                       
                                FROM TRIPS t
                                GROUP BY t.FROM_STATION_NAME,
                                         t.FROM_LATITUDE, 
                                         t.FROM_LONGITUDE
                                ORDER BY SUM(t.TRIP_DURATION) DESC
                                FETCH FIRST 10 ROWS ONLY;")
knitr::kable(divvy_from)
```

```{r divvy2, fig.height = 6, fig.width = 12, fig.align = "center"}
ggplot(transform(divvy_from, FROM_STATION_NAME=gsub("&", "\n&", FROM_STATION_NAME)),
                 aes(FROM_STATION_NAME, TOTAL_DURATION, fill=FROM_STATION_NAME)) +
  geom_col(position = "dodge") +
  labs(title="Divvy Top 10 Origination Stations", x="Station", y="Trip Duration (seconds)") +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))
```


```{r}
divvy_to <- dbGetQuery(conn, "SELECT t.TO_STATION_NAME, 
                                     t.TO_LATITUDE, 
                                     t.TO_LONGITUDE,
                                     SUM(t.TRIP_DURATION) AS Total_Duration,
                                     MIN(t.TRIP_DURATION) AS Min_Duration,
                                     AVG(t.TRIP_DURATION) AS Avg_Duration,
                                     MEDIAN(t.TRIP_DURATION) AS Median_Duration,
                                     MAX(t.TRIP_DURATION) AS Max_Duration
                                   
                            FROM TRIPS t
                            WHERE TO_STATION_NAME <> 'DIVVY Map Frame B/C Station'
                            GROUP BY t.TO_STATION_NAME,
                                     t.TO_LATITUDE, 
                                     t.TO_LONGITUDE
                            ORDER BY SUM(t.TRIP_DURATION) DESC
                            FETCH FIRST 10 ROWS ONLY;")
knitr::kable(divvy_to)
```

```{r divvy3, fig.height = 6, fig.width = 12, fig.align = "center"}
ggplot(transform(divvy_to, TO_STATION_NAME=gsub("&", "\n&", TO_STATION_NAME)), 
       aes(TO_STATION_NAME, TOTAL_DURATION, fill=TO_STATION_NAME)) +
  geom_col(position = "dodge") +
  labs(title="Divvy Top 10 Destination Stations", x="Station", y="Trip Duration (seconds)") +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))
```

```{r}
library(jsonlite)

x <- toJSON(divvy_from[c("FROM_STATION_NAME", "FROM_LATITUDE", "FROM_LONGITUDE")], pretty=TRUE)

# EXPORT TO FILE
fileConn <- file("Divvy_From_Coords.json")
writeLines(x, fileConn)
close(fileConn)

x
```

```{r}
x <- toJSON(divvy_to[c("TO_STATION_NAME", "TO_LATITUDE", "TO_LONGITUDE")], pretty=TRUE)

# EXPORT TO FILE
fileConn <- file("Divvy_To_Coords.json")
writeLines(x, fileConn)
close(fileConn)

x
```

<img src="Divvy_Coords_Google_Maps.png" />

```{r}
# DISCONNECT FROM DATABASE
dbDisconnect(conn)
```


------

<br/>

## <span style="color:#336791">Metra Data Normalization and Testing</span>

<center><img src="assets/metra_app_icon.png" width="100px"><img src="assets/r_sqlite.png" width="180px"/></center>

- ### <span style="color:#336791">Data hyiene and organization</span>
- ### <span style="color:#336791">Think differently and know your data</span>
- ### <span style="color:#336791">Robust data storage with normalization and relations</span>

<center><img src="assets/Metra_OTP_Report.png" width="550px"/></center>
<center><img src="assets/Metra_OTP_Process.png" width="550px"/></center>

```{r}
library(RSQLite)

conn <- dbConnect(RSQLite::SQLite(), dbname=config$sqlite_conn$database)
```


### By Year

```{r}
sql <- "SELECT l.Line, 
               t.Line AS Short,
               CAST(strftime('%Y', Report_Month) AS INT) as Year,
               SUM(t.Late_Trains) AS Total_Late_Trains,
               AVG(t.Late_Trains) AS Avg_Late_Trains
        FROM Trains t
        INNER JOIN Lines l ON t.LineID = l.ID
        WHERE l.Line <> 'SYSTEM'
        GROUP BY l.Line,
                 t.Line,
                 strftime('%Y', t.Report_Month)"

agg_sql <- dbGetQuery(conn, sql)

knitr::kable(agg_sql[sample(1:nrow(agg_sql), 20),])
```

```{r metra1, fig.height = 5, fig.width = 10, fig.align = "center"}
ggplot(agg_sql, aes(Year, Total_Late_Trains, color=Line)) + 
  geom_line(stat="identity") +
  labs(title="Metra Late Trains by Line", x="Year", y="Late Trains") +
  scale_x_continuous("year", breaks=unique(agg_sql$Year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_color_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))
```

------

## Statistical Testing

### T-Tests

```{r}
lines_list <- split(agg_sql, agg_sql$Short)

nms <- lapply(combn(unique(agg_sql$Short), 2, simplify=FALSE), function(i) paste(i, collapse="_"))

res <- t(sapply(combn(unique(agg_sql$Short), 2, simplify=FALSE), function(i) {
  t <- t.test(lines_list[[i[1]]]$Total_Late_Trains, lines_list[[i[2]]]$Total_Late_Trains)
  c(statistic = t$statistic, p_value = t$p.value)
}))

row.names(res) <- nms

knitr::kable(res)

```


### By Month

```{r}
sql <- "SELECT l.ID,
               l.Line, 
               t.Line AS Short,
               CAST(strftime('%m', Report_Month) AS INT) as Month_Num,
               SUM(t.Late_Trains) AS Total_Late_Trains,
               AVG(t.Late_Trains) AS Avg_Late_Trains
        FROM Trains t
        INNER JOIN Lines l ON t.LineID = l.ID
        WHERE l.Line <> 'SYSTEM'
        GROUP BY l.ID,
                 l.Line,
                 t.Line,
                 CAST(strftime('%m', Report_Month) AS INT)
        ORDER BY l.ID,
                 CAST(strftime('%m', Report_Month) AS INT)"

agg_sql <- transform(dbGetQuery(conn, sql), Month = factor(month.abb[Month_Num], levels=month.abb))

knitr::kable(agg_sql[sample(1:nrow(agg_sql), 20),])
```


```{r metra2, fig.height = 4, fig.width = 7, fig.align = "center"}
res <- by(agg_sql, agg_sql$Line, function(sub)
  print(ggplot(sub, aes(Month, Total_Late_Trains, fill=Line)) + 
    geom_col(position = "dodge") +
    labs(title=paste(sub$Line, "-\nLate Trains by Month"), x="Year", y="Late Trains") +
    scale_x_discrete("Month", breaks=unique(agg_sql$Month)) +
    scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
    scale_fill_manual(values=seaborn_palette[sub$ID[1]]) +
    theme(legend.position="bottom",
          plot.title = element_text(hjust=0.5, size=18),
          axis.text.x = element_text(angle=0, hjust=0.5)))
)
```


### Correlations

```{r}
sql <- "SELECT CAST(strftime('%Y', c.Report_Month) AS INT) AS Year,
               SUM(CASE WHEN c.Late_Cause = 'Accident' THEN c.Late_Trains ELSE NULL END) AS [Accident],
               SUM(CASE WHEN c.Late_Cause = 'Catenary Failure' THEN c.Late_Trains ELSE NULL END) AS [Catenary],               
               SUM(CASE WHEN c.Late_Cause = 'Freight Interference - Peak' THEN c.Late_Trains ELSE NULL END) AS [Freight Interference Peak],
               SUM(CASE WHEN c.Late_Cause = 'Freight Interference - Off-Peak' THEN c.Late_Trains ELSE NULL END) AS [Freight Interference Off Peak],  
               SUM(CASE WHEN c.Late_Cause = 'Human Error' THEN c.Late_Trains ELSE NULL END) AS [Human Error],                     
               SUM(CASE WHEN c.Late_Cause = 'Lift Deployment' THEN c.Late_Trains ELSE NULL END) AS [Lift Deployment],                
               SUM(CASE WHEN c.Late_Cause = 'Locomotive Failure' THEN c.Late_Trains ELSE NULL END) AS [Locomotive Failure],              
               SUM(CASE WHEN c.Late_Cause = 'Non-Locomotive Equipment Failure' THEN c.Late_Trains ELSE NULL END) AS [Non-Locomotive Equipment Failure], 
               SUM(CASE WHEN c.Late_Cause = 'Obstruction/Debris' THEN c.Late_Trains ELSE NULL END) AS [Obstruction_Debris],              
               SUM(CASE WHEN c.Late_Cause = 'Other' THEN c.Late_Trains ELSE NULL END) AS [Other],                            
               SUM(CASE WHEN c.Late_Cause = 'Passenger Loading' THEN c.Late_Trains ELSE NULL END) AS [Passenger Loading],
               SUM(CASE WHEN c.Late_Cause = 'Passenger Train Interference' THEN c.Late_Trains ELSE NULL END) AS [Passenger Train Interference],     
               SUM(CASE WHEN c.Late_Cause = 'Sick, Injured, Unruly Passenger' THEN c.Late_Trains ELSE NULL END) AS [Sick Injured Unruly Passenger],
               SUM(CASE WHEN c.Late_Cause = 'Signal/Switch Failure' THEN c.Late_Trains ELSE NULL END) AS [Signal Switch Failure],           
               SUM(CASE WHEN c.Late_Cause = 'Track Work' THEN c.Late_Trains ELSE NULL END) AS [Track Work],                      
               SUM(CASE WHEN c.Late_Cause = 'Weather' THEN c.Late_Trains ELSE NULL END) AS [Weather]            FROM Causes c 
      GROUP BY CAST(strftime('%Y', c.Report_Month) AS INT);"

wide_sql <- dbGetQuery(conn, sql)

knitr::kable(wide_sql)
```

```{r}
knitr::kable(cor(wide_sql[-1], use = "complete.obs", method="pearson"))
```


```{r}
sql <- "SELECT CAST(strftime('%Y', c.Report_Month) AS INT) AS Year,
               c.Late_Cause,
               SUM(c.Late_Trains) AS Total_Late_Trains
        FROM Causes c
        INNER JOIN Lines l ON c.LineID = l.ID
        WHERE c.Late_Cause IN 
              (SELECT DISTINCT sub_c.Late_Cause 
               FROM Causes AS sub_c 
               WHERE strftime('%Y', sub_c.Report_Month) = '2012')
          AND c.Late_Cause <> 'TOTAL TRAINS DELAYED'
        GROUP BY strftime('%Y', c.Report_Month),
                 c.Late_Cause"

agg_sql <- dbGetQuery(conn, sql)

knitr::kable(agg_sql[sample(1:nrow(agg_sql), 20),])
```


```{r metra3, fig.height = 5, fig.width = 10, fig.align = "center"}
ggplot(subset(agg_sql, Late_Cause != 'Freight Interference - Total'),
              aes(Year, Total_Late_Trains, fill=Late_Cause)) + 
  geom_bar(position = "fill", stat = "identity") +
  labs(title="Metra Late Trains by Cause", x="Year", y="Late Trains") +
  guides(fill=guide_legend(ncol=4)) +
  scale_x_continuous("year", breaks=unique(agg_sql$Year)) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))
```

```{r}
# DISCONNECT FROM DATABASE
dbDisconnect(conn)
```


------


<br/>

## <span style="color:#336791">Conclusion</span> ##

<center><img src="assets/RDBMS_Icons.png" width="500px" /></center>  

- ### <span style="color:#336791">Relational databases provides a stable, centralized, repository for data sourcing</span> ###
- ### <span style="color:#336791">Relational databases maintains a proficient query optimizer and set-based language for data processing</span> ###
- ### <span style="color:#336791">Relational databases supports data science ensures best practices and reproducibility</span> ###

<br/>
<br/>
<br/>


## 

